Challenge 02 Pandas Chris Buie.

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import plotly
import cufflinks as cf
import pandoc
plotly.offline.init_notebook_mode()
%matplotlib inline

from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()

import os
os.chdir('/Users/cbuie/PycharmProjects/sf16_ds4/challenges/02-pandas')
//anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
In [13]:
data = pd.read_csv('2013_Movies.csv')
data.info()
data['ReleaseDate'] = pd.to_datetime(data['ReleaseDate'], infer_datetime_format=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
Title                 100 non-null object
Budget                89 non-null float64
DomesticTotalGross    100 non-null int64
Director              96 non-null object
Rating                100 non-null object
Runtime               100 non-null int64
ReleaseDate           100 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 5.5+ KB

Exercise 2.1

Plot domestic total gross over time.

In [14]:
##1rst way with sns
# sns.swarmplot(x="ReleaseDate", y="DomesticTotalGross", data=data[['ReleaseDate','DomesticTotalGross']], size=10)

##2rd way with pyplot
# x = np.array(data['ReleaseDate'])
# y = np.array(data['DomesticTotalGross'])
# plt.pyplot.figure(figsize=(12,6))
# plt.pyplot.scatter(x,y,alpha=0.8,marker='o')

##3rd way with plotly
fig = {
    'data': [
  		{
  			'x': data['ReleaseDate'],
        	'y': data['DomesticTotalGross'],  
        	'mode': 'markers', 
        	'name': '2013'
        }
    ],
    'layout': {
        'title':{'title': 'DTG vs Release Date'},
        'xaxis': {'title': 'Date'},
        'yaxis': {'title': "Domestic Total Gross"}
        
    }
}

plotly.offline.iplot(fig )

Exercise 2.2

Plot runtime vs. domestic total gross.

In [15]:
fig = {
    'data': [
  		{
  			'x': data['DomesticTotalGross'],
        	'y': data['Runtime'],  
        	'mode': 'markers',  
        	'name': '2013'
        }
    ],
    'layout': {
        'title':{'title': 'Runtime vs. DTG (m)'},
        'xaxis': {'title': 'DomesticTotalGross (log scale)',  'type': 'log'},
        'yaxis': {'title': "Runtime"}
        
    }
}

plotly.offline.iplot(fig, filename='pandas/multiple-scatter' , )

Exercise 2.3

Group your data by Rating and find the average runtime and domestic total gross at each level of Rating.

In [16]:
by_rating_df = data.groupby(['Rating'])['Runtime','DomesticTotalGross'].mean()
print by_rating_df
           Runtime  DomesticTotalGross
Rating                                
G       107.000000        2.684928e+08
PG       99.933333        1.311357e+08
PG-13   117.510638        1.114498e+08
R       110.729730        6.989243e+07

Exercise 2.4

Make one figure with (N=the number of MPAA ratings there are) subplots, and in each plot the release date vs the domestic total gross.

In [17]:
by_rating_df2 = data.groupby(['Rating','ReleaseDate'])['DomesticTotalGross'].agg(np.mean).reset_index()
by_rating_df2.set_index('ReleaseDate')
by_rating_df2['DomesticTotalGross'] = by_rating_df2['DomesticTotalGross']/1000000

by_rating_df2 = by_rating_df2.pivot(index='ReleaseDate', columns='Rating', values='DomesticTotalGross')

by_rating_df2.iplot(subplots=True, online=False,subplot_titles = True, kind='scatter',mode='markers', size='6',
                   title='Domestic Total Gross vs. Release Date',theme='white')

Exercise 2.5

What director in your dataset has the highest gross per movie?

Alfonso Cuaron has the highest Average DTG: 274092705 (*only one movie)

In [18]:
by_Director_df = data.groupby(['Director'])['DomesticTotalGross'].agg([np.mean,len]).reset_index()

print by_Director_df.head(10).sort_values('mean', ascending = False)
                     Director       mean  len
2              Alfonso Cuaron  274092705    1
1                 Alan Taylor  206362140    1
8                Baz Luhrmann  144840419    1
0                  Adam McKay  125168368    1
5               Antoine Fuqua   98925640    1
6           Baltasar Kormakur   75612460    1
3           Andres Muschietti   71628180    1
9                 Ben Stiller   58236838    1
7  Barry CookNeil Nightingale   36076121    1
4               Andrew Niccol   26627201    1

Exercise 2.6

Bin your dataset into months and make a bar graph of the mean domestic total gross by month. Error bars will represent the standard error of the mean.

Title of graph should include: Mean Domestic Total Gross by Month in 2013

Topic for consideration: what is the correct formula for the standard error of the mean? Examine the error bars and see if they are "reasonable."

CB notes:

The standard error of the mean (SE of the mean) estimates the variability between sample means that you would obtain if you took multiple samples from the same population. The standard error of the mean estimates the variability between samples whereas the standard deviation measures the variability within a single sample

The standard error can be calculated by taking the std/sqrt(n).

Use the standard error of the mean to determine how precisely the mean of the sample estimates the population mean. Lower values of the standard error of the mean indicate more precise estimates of the population mean. Usually, a larger standard deviation will result in a larger standard error of the mean and a less precise estimate. A larger sample size will result in a smaller standard error of the mean and a more precise estimate.

For example, you have a mean delivery time of 3.80 days with a standard deviation of 1.43 days based on a random sample of 312 delivery times. These numbers yield a standard error of the mean of 0.08 days (1.43 divided by the square root of 312). Had you taken multiple random samples of the same size and from the same population the standard deviation of those different sample means would be around 0.08 days.

In [19]:
from scipy import stats

data['month'] = pd.DatetimeIndex(data['ReleaseDate']).month
by_month_df = data.groupby(['month'])['DomesticTotalGross'].agg([np.mean,np.std,len, stats.sem])
by_month_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 1 to 12
Data columns (total 4 columns):
mean    12 non-null int64
std     12 non-null float64
len     12 non-null int64
sem     12 non-null int64
dtypes: float64(1), int64(3)
memory usage: 480.0 bytes
In [20]:
import plotly.plotly as py
import plotly.graph_objs as go

x = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sept','Oct','Nov','Dec']

# x = by_month_df.index
y = by_month_df['mean']
y1 = by_month_df['len']
e = by_month_df['sem']


trace1 = go.Bar(
    x = x,
    y = y,
    
    
    error_y=dict(
    type='data',
    array=e
        ),
 
    
    name='Mean GTD (m)',
        marker=dict(
        color='rgba(50, 171, 96, 0.6)',
        line=dict(
            color='rgba(50, 171, 96, 1.0)',
            width=2)
    )
)

# trace2 = go.Scatter(
#     x=x,
#     y=y1,
#     name='Monthly Count',
#     marker=dict(color = 'rgb(148, 103, 189)'),
#     yaxis='y2'
# )

data = [trace1]
layout = go.Layout(
    title='Mean Domestic Total Gross by Month in 2013',
    xaxis=dict(
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Gross Domestic Profit (millions)',
        titlefont=dict(
            size=12,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=12,
            color='rgb(107, 107, 107)'
        )
    ),
#         yaxis2=dict(
#         title='Monthly Count',
#         titlefont=dict(
#             color='rgb(148, 103, 189)'
#         ),
#         tickfont=dict(
#             color='rgb(148, 103, 189)'
#         ),
#         overlaying='y',
#         side='right'
    )
#    ) 
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig,)
In [10]:
# import plotly.plotly as py
# from plotly.graph_objs import *
# import plotly.graph_objs as go
# import pandas as pd

# by_month_df

# months = by_month_df.index
# DGP = by_month_df['mean']
# stderr = by_month_df['sem']


# data = [

#         go.Bar(
#             x = months,
#             y = DGP,

#             error_y=dict(
#                 type='data',
#                 array=stderr
          
#             ),

#         )

# ]

# layout = go.Layout( xaxis=XAxis(type='category') )
# fig = Figure( data=data, layout=layout)


# plotly.offline.iplot(fig)
In [ ]: